This section introduces the setup of a simulated example with three scenarios. We compute theoretical values for the five premium benchmarks, visualize them to aid interpretation, and simulate data for use in subsequent sections.
This material complements Section 4.1 of the main paper; it is linked from the supplement’s introduction and from the left margin of the present interface.
We begin by loading a few packages:
library(tidyverse)
library(latex2exp)
library(jsonlite)
Consider potential explanatory variables \(X_1\), \(X_2\), and \(D\) to predict \(Y\). Let \(X_1\) be normally distributed with mean \(1\) and variance \(9\). Let \(X_2\) be a discrete random variable with \(P(X_2 = k) = 1/4\) for \(k \in \{1, 2, 3, 4\}\). Assume \(D\) is a binary sensitive characteristic with \(P(D = 1) = 1/2\).
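For concreteness, these marginals can be drawn directly in R (an illustrative sketch with names of our choosing; \(D\) is generated later from its propensity):
set.seed(42)
x1_demo <- rnorm(5, mean = 1, sd = 3)     # X1 ~ N(1, 9)
x2_demo <- sample(1:4, 5, replace = TRUE) # P(X2 = k) = 1/4, k = 1, ..., 4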
To analyze proxy effects and potential unfairness, we examine three scenarios, each defined by a different combination of conditional distributions of \((Y|X_1, X_2, D)\) and of \((D|X_1, X_2).\)
For scenarios 1 and 2, the propensity for \(D = 1\) is the step function illustrated in the left panel of Figure 2.1. This propensity, \(P(D = 1|X_1 = x_1, X_2 = x_2)\), represents the likelihood that individuals with risk profile \((x_1, x_2)\) belong to group \(D = 1\). It governs the dependence between \(D\) and the permitted covariates, which is the core of proxy effects. In scenario 3, the shape of \(P(D = 1|X_1, X_2)\) is more complex, as shown in the right panel of Figure 2.1.
For scenarios 1 and 2, the propensity for \(D = 1\) is given explicitly by \[\begin{equation*} P(D = 1|X_1, X_2) = 0.05 + 0.1 X_2 + 0.4 \times \mathbf{1}\{X_1 > E(X_1)\}, \end{equation*}\] where \(\mathbf{1}\{A\} = 1\) if \(A\) is true, and \(0\) otherwise. This is a step function: piecewise constant in \(X_1\) through the indicator, and in \(X_2\) because \(X_2\) is discrete. For scenario 3, the propensity is \[\begin{equation*} P(D = 1|X_1, X_2) = F^{-1}_B(\text{expit}[0.05 \{X_1 - E(X_1)\}^2 - \delta]), \end{equation*}\] where \(\text{expit}(x) = \text{exp}(x)/\{1 + \text{exp}(x)\}\), the constant \(\delta\) ensures that \(P(D = 1) = 0.5\), and \(B\) is a random variable with distribution \(\text{Beta}(\alpha = f(X_2), \beta = f(X_2))\), where \[\begin{equation*} f(x) = 999 \times \mathbf{1}\{x = 1\} + \mathbf{1}\{x = 2\} + 0.4 \times \mathbf{1}\{x = 3\} + 0.2 \times \mathbf{1}\{x = 4\}. \end{equation*}\]
## Propensity P(D = d|X1, X2), before the scenario-3 rescaling step
prob_D_unscaled <- function(X1, X2, D = 1, params){
lowX1 <- X1 < params[['g_0']]
if(params[['scen']] == 3){
to_return_eta <- (0.05 * (X1 - params[['g_0']])^2)
to_return <- exp(to_return_eta)/(1 + exp(to_return_eta))
} else if(params[['scen']] %in% c(1,2)){
to_return <- 0.45 + 0.1 * X2 - 0.4 * lowX1
}
p1 <- to_return
if(length(D) == 1){
if(D == 1) p <- p1
if(D == 0) p <- 1 - p1
} else if(length(D) == length(p1)){
p <- ifelse(D == 1, p1, 1 - p1)
}
return(p)
}
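As a quick sanity check of the step shape in scenarios 1 and 2, an illustrative call (p_demo is a minimal parameter list assumed here; the full lists are defined below):
p_demo <- list('scen' = 1, 'g_0' = 1)
## below E(X1) = 1: 0.45 + 0.1 * 2 - 0.4 = 0.25; above: 0.45 + 0.1 * 2 = 0.65
prob_D_unscaled(X1 = c(-2, 4), X2 = c(2, 2), D = 1, params = p_demo)
#> [1] 0.25 0.65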
## Marginal P(D = d), integrating the propensity over a 99-node quantile grid for X1 and over the values in X2_val
prob_D_margin <- function(d, X2_val = 1:4, params){
poss_X2 <- 1:4
u_X1 <- seq(0.01, 0.99, length.out = 99)
poss_X1 <- qnorm(u_X1, mean = params[['g_0']],
sd = params[['sd_X']])
prob_table <- expand.grid('X2' = poss_X2,
'X1' = poss_X1)
prob_table$w <- 1/99 * 0.25
prob_table$prob <- prob_D_unscaled(prob_table$X1, prob_table$X2,
D = rep(1, nrow(prob_table)), params = params)
to_calc <- prob_table %>% filter(X2 %in% X2_val) %>%
mutate(w = w * 4/length(X2_val))
p1 <- sum(to_calc$w * to_calc$prob) / sum(to_calc$w)
return(
ifelse(d == 1, p1, 1 - p1)
)
}
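By construction, the marginal in scenarios 1 and 2 is \(P(D = 1) = 0.45 + 0.1 E(X_2) - 0.4 \times 0.5 = 0.5\); a quick check with a minimal parameter list (an illustration, the full lists come below):
prob_D_margin(d = 1, params = list('scen' = 1, 'g_0' = 1, 'sd_X' = 3))
#> approximately 0.5 (0.502 on the 99-node quadrature grid)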
## Final propensity P(D = d|X1, X2): scenarios 1 and 2 use the step function directly;
## scenario 3 recentres the unscaled propensity and maps it through the Beta quantile function
prob_D <- function(X1, X2, D = 1, params){
lowX1 <- X1 < params[['g_0']]
if(params[['scen']] == 3){
marg_x2 <- sapply(1:4, function(x2_value){
prob_D_margin(d = 1, X2_val = x2_value, params = params)
})
marg_x2_vec <- sapply(X2, function(x2_val){
marg_x2[x2_val]
})
unsc_probs <- prob_D_unscaled(X1 = X1,
X2 = X2,
D = 1,
params = params)
unstr_probs <- unsc_probs - (marg_x2_vec - 0.5) # recentre so that P(D = 1) = 0.5 (the role of delta)
qty_alpha <- ifelse(X2 == 1, 999,
ifelse(X2 == 3,
0.4,
ifelse(X2 == 4,
0.2, 1)))
to_return <- qbeta(unstr_probs, qty_alpha, qty_alpha)
} else if(params[['scen']] %in% c(1,2)){
to_return <- 0.45 + 0.1 * X2 - 0.4 * lowX1
}
p1 <- to_return
if(length(D) == 1){
if(D == 1) p <- p1
if(D == 0) p <- 1 - p1
} else if(length(D) == length(p1)){
p <- ifelse(D == 1, p1, 1 - p1)
}
return(p)
}
For illustration, we assume that the claim amount \(Y\) is normally distributed with \(\sigma^2 = 45\) and \[\begin{equation} \label{eq:ex_distY} E(Y|X_1, X_2, D) = 100 + 3 X_1 \{(1 - \gamma_D) + \gamma_D D\} + 15 D, \end{equation}\] with \(\gamma_D = 0\) (no interaction) for scenario 1 and \(\gamma_D = 0.5\) (interaction) for scenarios 2 and 3. Note that \(X_2\) is not a true risk factor for \(Y\) in any of the three scenarios.
We then define a set of parameters consistent with the three scenarios.
parms_original <- list('b_0' = 100, # beta0 for P(Y|X, D)
'b_d' = 15, # betaD for P(Y|X, D)
'b_x' = 4, # betaX for P(Y|X, D)
'b_A' = 0, # betaA for P(Y|X, D) (moral hazard)
'sd_Y' = sqrt(45), # sd for P(Y|X, D) (gaussian)
'g_0' = 1, # gamma0 for P(X)
'int_d' = 0, # for interaction
'scen' = 1, # for propensity
'sd_X' = 3 # sd for P(X) (gaussian)
)
parms <- list('Scenario1' = parms_original,
'Scenario2' = {tmp <- parms_original; tmp[['scen']] <- 2; tmp[['int_d']] <- 0.5; tmp},
'Scenario3' = {tmp <- parms_original; tmp[['scen']] <- 3; tmp[['int_d']] <- 0.5; tmp})
rm(parms_original)
We now create functions to compute all pricing benchmarks theoretically; this involves several auxiliary functions for the theoretical expression of the corrective premium.
## Inverse of the expectation: takes an expected value s, D, and a set of parameters as arguments; returns the corresponding x1.
inv_Esp_Y <- function(s, D, params){
(s - (params[['b_0']] + params[['b_d']] * D))/(
params[['b_x']] * ((1 - params[['int_d']]) + params[['int_d']] * D)
)
}
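Later chunks also call Esp_Y(), the theoretical expectation \(E(Y|X_1, X_2, D)\), which is defined in an earlier chunk of this supplement. For completeness, here is a minimal sketch consistent with inv_Esp_Y() and the parameter lists above (an assumption, not necessarily the original definition):
Esp_Y <- function(X1, X2, D, params){
  ## E(Y|X1, X2, D) = b_0 + b_d * D + b_x * {(1 - int_d) + int_d * D} * X1
  ## X2 is accepted for a uniform interface but unused: it is not a true risk factor for Y
  params[['b_0']] + params[['b_d']] * D +
    params[['b_x']] * ((1 - params[['int_d']]) + params[['int_d']] * D) * X1
}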
# Precompute a grid for p_d_x1
precompute_p_d_x1 <- function(params, x1_grid = seq(-12, 15, length.out = 300)) {
# Expand the grid
grid <- expand.grid(d = c(0, 1), x1 = x1_grid, x2 = 1:4)
# Compute probabilities and densities in a vectorized manner
grid$prob <- prob_D(X1 = grid$x1, X2 = grid$x2, D = grid$d, params = params)
#grid$dens <- dnorm(grid$x1, mean = params[['g_0']], sd = params[['sd_X']])
# Aggregate over X2
aggregated <- aggregate(prob ~ d + x1, data = grid, FUN = mean)
colnames(aggregated)[3] <- "p_d_x1" # Rename the result column
return(aggregated)
}
create_p_d_x1_interpolators <- function(grid) {
# Split the grid by d
grid_split <- split(grid, grid$d)
# Create interpolation functions for d = 0 and d = 1
interpolators <- lapply(grid_split, function(subgrid) {
approxfun(subgrid$x1, subgrid$p_d_x1,
rule = c(2, 2)) # rule = 2: constant (nearest-value) extrapolation beyond the grid
})
return(interpolators)
}
p_d_x1 <- function(d, x1, params, interpolator) {
integrand <- function(x1_val){
interpolator(x1_val) * dnorm(x1_val,
mean = params[['g_0']],
sd = params[['sd_X']])
}
# Perform numerical integration over x1
lower_bound <- pmin(x1 - 2, qnorm(0.0005, mean = params[['g_0']], sd = params[['sd_X']]))
upper_bound <- pmin(x1, qnorm(0.99999, mean = params[['g_0']], sd = params[['sd_X']]))
result <- pracma::quad(Vectorize(integrand),
xa = lower_bound,
xb = upper_bound)
return(result)
}
F_S_D <- function(s, D, params, interpolators) {
# Compute x1(s)
x1s <- inv_Esp_Y(s, D = D, params = params)
# Compute P(X1 < x1s)
# p_x1s <- pnorm(x1s, mean = params[['g_0']], sd = params[['sd_X']])
to_return <- mapply(
function(d, x1){
interp <- interpolators[[as.character(d)]]
p_d_x1(d, x1, params,
interpolator = interp) /
p_d_x1(d, 99, params, # x1 = 99 acts as +Inf: total mass P(D = d)
interpolator = interp)},
D, x1s
)
# Compute F_S_D
return(to_return)
}
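In other words, F_S_D evaluates the conditional distribution of the best-estimate premium within group \(d\): writing \(x_1(s)\) for the threshold returned by inv_Esp_Y and \(\varphi\) for the density of \(X_1\), \[\begin{equation*} F_{S|D}(s|d) = P\{\mu^B \leq s | D = d\} = \frac{\int_{-\infty}^{x_1(s)} P(D = d|X_1 = u) \, \varphi(u) \, du}{\int_{-\infty}^{+\infty} P(D = d|X_1 = u) \, \varphi(u) \, du}, \end{equation*}\] where \(P(D = d|X_1)\) is the propensity averaged uniformly over the four values of \(X_2\) (the precomputed interpolators).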
inverse_F_S_D <- function(p, D, params, tolerance = 1e-6, max_iter = 100, interpolators) {
# Validate p
if (any(p < 0 | p > 1)) stop("p must be between 0 and 1")
# Define the root-finding function
find_root <- function(one_p, one_D) {
tryCatch({
uniroot(
function(s) F_S_D(s, one_D, params, interpolators) - one_p, # Use interpolators in F_S_D
interval = c(50, 165), # Specify the interval for s
tol = tolerance,
maxiter = max_iter
)$root
}, error = function(e) NA) # Return NA if uniroot fails
}
# Vectorize over both p and D using mapply
return(mapply(find_root, one_p = p, one_D = D))
}
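These pieces combine into the corrective premium implemented next by maps_to_corr_theo: the equal-weight barycenter of the two group-wise quantile maps, \[\begin{equation*} \mu^C(s, d) = \tfrac{1}{2} F^{-1}_{S|D = 0}\{F_{S|D}(s|d)\} + \tfrac{1}{2} F^{-1}_{S|D = 1}\{F_{S|D}(s|d)\}, \end{equation*}\] with the probability level clipped to \([0.005, 0.995]\) for numerical stability.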
maps_to_corr_theo <- function(mu_B, D, params){
# Interpolate p_d_x1 for each D and x1s
# Compute x1(s)
x1_grid <- seq(qnorm(0.0005, mean = params[['g_0']],
sd = params[['sd_X']]),
qnorm(0.99999, mean = params[['g_0']],
sd = params[['sd_X']]),
length.out = 100)
precomputed_grid <- precompute_p_d_x1(params, x1_grid)
# Step 2: Create interpolators
interpolators <- create_p_d_x1_interpolators(precomputed_grid)
u_d <- pmin(pmax(F_S_D(mu_B, D, params,
interpolators),
0.005), 0.995)
## group-wise quantile maps: invert F_{S|D} at level u_d, for d = 0 and d = 1
inverse_values <- lapply(0:1, function(d){
inverse_F_S_D(p = u_d, D = rep(d, length(u_d)),
params = params,
interpolators = interpolators)
})
rowSums(do.call(cbind, inverse_values)) * 0.5 # equal-weight barycenter of the two maps
}
levels_for_premiums <- c("mu_B", "mu_U", "mu_A", "mu_H", "mu_C")
labels_for_premiums <- c("$\\mu^B$","$\\mu^U$", "$\\mu^A$", "$\\mu^H$", "$\\mu^C$")
## four ingredients for the 5 families of premiums
premium_generator <- function(best, pdx, maps_to_corr, marg){
list("mu_B" = function(x1, x2, d){
## simple call of the best estimate model
best(data.frame(X1 = x1,
X2 = x2,
D = d))
}, "mu_U" = function(x1, x2, d = NULL){
## Explicit inference : mu_U = E(mu_B|X)
tab_best <- sapply(0:1, function(dl){
best(data.frame(X1 = x1,
X2 = x2,
# X3 = x3,
D = rep(dl, length(x1))))
})
tab_pdx <- sapply(0:1, function(dl){
pdx(data.frame(X1 = x1,
X2 = x2,
D = rep(dl, length(x1))))
})
(tab_best * tab_pdx) %>% apply(1, sum)
}, "mu_A" = function(x1, x2, d = NULL){
## mu_A = E(mu_B) unconditional
sapply(0:1, function(d){best(data.frame(X1 = x1,
X2 = x2,
D = rep(d, length(x1))))*
marg[d + 1]}) %>% apply(1, sum)
}, "mu_H" = function(x1, x2, d = NULL){
## Here we cheated by using our mapping of mu_C
## explicit inference of mu_C : mu_H = E(mu_C|X)
tab_corr <- sapply(0:1, function(dl){
sb <- best(data.frame(X1 = x1,
X2 = x2,
D = rep(dl, length(x1))))
maps_to_corr(data.frame(mu_B = sb, D = dl))
})
tab_pdx <- sapply(0:1, function(dl){
pdx(data.frame(X1 = x1,
X2 = x2,
D = rep(dl, length(x1))))
})
(tab_corr * tab_pdx) %>% apply(1, sum)
}, "mu_C" = function(x1, x2, d = NULL){
## mu_C = T^{d}(mu_B(x, d))
mu_b <- best(data.frame(X1 = x1,
X2 = x2,
# X3 = x3,
D = d))
maps_to_corr(data.frame(mu_B = mu_b, D = d))
})
}
levels_for_quants <- c('proxy_vuln', 'risk_spread', 'fair_range', 'parity_cost')
quant_generator <- function(premiums){
list('proxy_vuln' = function(x1, x2, d){
premiums$mu_U(x1 = x1, x2 = x2) -
premiums$mu_A(x1 = x1, x2 = x2)
},
'risk_spread' = function(x1, x2, d){
to_minmax <- data.frame(risk1 = premiums$mu_B(x1 = x1,
x2 = x2,
d = 1),
risk0 = premiums$mu_B(x1 = x1,
x2 = x2,
d = 0))
apply(to_minmax, 1, max) - apply(to_minmax, 1, min)
},
'fair_range' = function(x1, x2, d){
to_minmax <- data.frame(mu_b = premiums$mu_B(x1 = x1, x2 = x2,
d = d),
mu_u = premiums$mu_U(x1 = x1, x2 = x2,
d = NULL),
mu_a = premiums$mu_A(x1 = x1, x2 = x2,
d = NULL),
mu_h = premiums$mu_H(x1 = x1, x2 = x2,
d = NULL),
mu_c = premiums$mu_C(x1 = x1, x2 = x2,
d = d))
apply(to_minmax, 1, max) - apply(to_minmax, 1, min)
},
'parity_cost' = function(x1, x2, d){
premiums$mu_C(x1 = x1, x2 = x2,
d = d) -
premiums$mu_B(x1 = x1, x2 = x2,
d = d)
})
}
premiums_theo <- setNames(nm = names(parms)) %>% lapply(function(name){
best_fun_theo <- function(newdata){
Esp_Y(X1 = newdata$X1,
X2 = newdata$X2,
D = newdata$D,
params = parms[[name]])
}
pdx_fun_theo <- function(newdata){
prob_D(X1 = newdata$X1,
X2 = newdata$X2,
D = newdata$D,
params = parms[[name]])
}
maps_to_corr_fun_theo <- function(newdata){
maps_to_corr_theo(mu_B = newdata$mu_B,
D = newdata$D,
params = parms[[name]])
}
marg <- sapply(0:1, function(d){
if(name != 'Scenario3'){
prob_D_margin(d, params = parms[[name]])
} else {
0.5
}
})
premium_generator(best = best_fun_theo,
pdx = pdx_fun_theo,
maps_to_corr = maps_to_corr_fun_theo,
marg = marg)
})
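A quick spot check of the resulting functions (the value below assumes the Esp_Y sketch above): under Scenario 1, the best-estimate premium at \(x_1 = 1\), \(d = 1\) is \(100 + 4 \times 1 + 15 = 119\), irrespective of \(x_2\).
premiums_theo$Scenario1$mu_B(x1 = 1, x2 = 2, d = 1)
#> [1] 119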
quants_theo <- setNames(nm = names(parms)) %>% lapply(function(name){
quant_generator(premiums = premiums_theo[[name]])
})
We now apply the theoretical pricing functions from all five families on a small grid dataset to visualize the results.
All results were obtained on a single thread under R 4.4.2. Numerical integration for the following code chunk takes approximately one hour on an Intel(R) Core(TM) Ultra 7 165H CPU with 32 GB of RAM.
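The chunk below evaluates these functions on grid_to_test, a data frame with columns x1, x2, and d built in an earlier chunk of this supplement. A minimal sketch consistent with its use here (the exact range and resolution are assumptions):
grid_to_test <- expand.grid('x1' = seq(-9, 11, by = 0.25), # covers the plotting windows below
                            'x2' = 1:4,
                            'd' = 0:1)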
df_to_g_theo_file <- "preds/df_to_g_theo.json"
## If the folder does not exist, create it
if (!dir.exists('preds')) {
dir.create('preds')
}
# Check if the JSON file exists
if (file.exists(df_to_g_theo_file)) {
message(sprintf("[%s] File exists. Reading df_to_g from %s", Sys.time(), df_to_g_theo_file))
df_to_g_theo <- fromJSON(df_to_g_theo_file)
} else {
df_to_g_theo <- setNames(nm = names(parms)) %>% lapply(function(name) {
message(sprintf("[%s] Processing: %s", Sys.time(), name))
# Start time for this scenario
start_time <- Sys.time()
# Compute theoretical premiums
message(sprintf("[%s] Step 3: Computing theoretical premiums", Sys.time()))
theo_premiums <- setNames(object = levels_for_premiums, nm = paste0(levels_for_premiums, '_t')) %>%
sapply(function(s) {
message(sprintf("[%s] Computing theoretical premium: %s", Sys.time(), s))
premiums_theo[[name]][[s]](
x1 = grid_to_test$x1,
x2 = grid_to_test$x2,
d = grid_to_test$d
)
})
# Compute theoretical PDX
message(sprintf("[%s] Step 4: Computing theoretical PDX", Sys.time()))
pdx_t_results <- prob_D(
X1 = grid_to_test$x1,
X2 = grid_to_test$x2,
D = grid_to_test$d,
params = parms[[name]]
)
# Combine results
message(sprintf("[%s] Step 5: Combining results", Sys.time()))
result <- data.frame(
grid_to_test,
theo_premiums,
pdx_t = pdx_t_results
)
# Log completion time
end_time <- Sys.time()
message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))
return(result)
})
# Save the entire df_to_g object to JSON
message(sprintf("[%s] Saving df_to_g_theo to %s", Sys.time(), df_to_g_theo_file))
toJSON(df_to_g_theo, pretty = TRUE, auto_unbox = TRUE) %>% write(df_to_g_theo_file)
}
Related section
The local fairness metrics are discussed in Chapter 3 of this online supplement and defined in Section 5 of the main paper.
grid_stats_path_theo <- 'preds/preds_grid_stats_theo.json'
# Check and load or compute preds_grid_stats
if (file.exists(grid_stats_path_theo)) {
preds_grid_stats_theo <- fromJSON(grid_stats_path_theo)
} else {
preds_grid_stats_theo <- setNames(nm = names(parms)) %>%
lapply(function(name) {
data.frame(df_to_g_theo[[name]],
'risk_spread_t' = quants_theo[[name]][['risk_spread']](x1 = df_to_g_theo[[name]]$x1,
x2 = df_to_g_theo[[name]]$x2,
d = df_to_g_theo[[name]]$d),
'proxy_vuln_t' = quants_theo[[name]][['proxy_vuln']](x1 = df_to_g_theo[[name]]$x1,
x2 = df_to_g_theo[[name]]$x2,
d = df_to_g_theo[[name]]$d),
## computed directly from the stored premium columns, which is quicker (only) in the theoretical case: it avoids the numerical integration behind mu_C.
'fair_range_t' = (apply(df_to_g_theo[[name]][c('mu_B_t', 'mu_U_t', 'mu_A_t', 'mu_H_t', 'mu_C_t')], 1, max) -
apply(df_to_g_theo[[name]][c('mu_B_t', 'mu_U_t', 'mu_A_t', 'mu_H_t', 'mu_C_t')], 1, min)) |> unname(),
'parity_cost_t' = df_to_g_theo[[name]][['mu_C_t']] - df_to_g_theo[[name]][['mu_B_t']])
})
toJSON(preds_grid_stats_theo, pretty = TRUE, auto_unbox = TRUE) %>%
write(grid_stats_path_theo)
}
to_save_pdx_t_perpop <- names(df_to_g_theo)[-1] %>%
lapply(function(name){
cols <- the_CAS_colors # colour palette defined in an earlier setup chunk of this supplement
pop_id <- which(names(df_to_g_theo) == name)
## keep the y axis only for the first panel
if(pop_id == 2){ # first panel ('Scenario 1 and 2'): show y labels
the_y_scale <- ylim(0,1)
the_y_label <- latex2exp::TeX("$P(D = 1|x_1, x_2)$")
} else { # otherwise, remove everything
the_y_scale <- scale_y_continuous(labels = NULL, limits = c(0,1))
the_y_label <- NULL
}
the_pops <- c(NA, 'Scenario 1 and 2', 'Scenario 3')
## let's graph
df_to_g_theo[[name]] %>%
mutate(the_population = name) %>%
filter(x1 >= -9, x1 <= 11,
d == 1) %>%
group_by(x1, x2) %>% summarise(pdx = mean(pdx_t)) %>% ungroup %>%
ggplot(aes(x = x1, y = pdx,
lty = factor(x2),
linewidth = factor(x2),
shape = factor(x2),
alpha = factor(x2),
color = factor(x2))) +
geom_line() +
scale_linetype_manual(values = c('solid', '31', '21', '11'), name = latex2exp::TeX('$x_2$')) +
scale_color_manual(values = cols, name = latex2exp::TeX('$x_2$')) +
scale_linewidth_manual(values = c(2, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +
scale_alpha_manual(values = c(0.65, 0.75, 0.85, 0.9), name = latex2exp::TeX('$x_2$')) +
labs(x = latex2exp::TeX("$x_1$"),
y = the_y_label,
title = latex2exp::TeX(the_pops[pop_id])) +
scale_x_continuous(breaks = c(-3:3)*3 + 1) + # see above
theme_minimal() +
the_y_scale +
theme(plot.title = element_text(hjust=0.5))
}) %>% ggpubr::ggarrange(plotlist = .,
nrow = 1,
widths = 15, heights = 1,
common.legend = T,
legend = 'right')
if (!dir.exists('figs')) dir.create('figs')
ggsave(filename = "figs/graph_pdx_t_perpop.png",
plot = to_save_pdx_t_perpop,
height = 3.25,
width = 7.55,
units = "in",
device = "png", dpi = 500)
Figure 1.2 shows three premiums: best-estimate, unaware, and aware. The best-estimate premium in red correctly ignores \(X_2\) but directly discriminates on \(D\): for a given value of \(x_1\), an individual with \(d = 1\) (dashed) pays a higher premium than one with \(d = 0\) (solid). The unaware premium in orange exploits \(X_2\) (different line widths) to reveal information about the omitted \(D\), mirroring the shape of the propensity in Figure 2.1. By controlling for \(D\), the aware premium (green) prevents proxy effects without differentiating prices on \(D\) or \(X_2\).
to_save_premiumsdense_BUA_t_perpop <- names(df_to_g_theo) %>% lapply(function(pop_name){
## the colors
cols <- RColorBrewer::brewer.pal(5, 'Spectral')[1:3] %>% colorspace::darken(0.25)
# the_CAS_colors_full[c(6, 5, 2)]
pop_id <- which(names(parms) == pop_name)
## keep the y axis only for the first panel
if(pop_name == head(names(df_to_g_theo), 1)){ # first panel: show y labels
the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
labels = scales::dollar,
limits = c(90, 140))
the_y_label <-
latex2exp::TeX("$\\mu(x_1, x_2, d)$")
} else { # otherwise, remove everything
the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
labels = NULL,
limits = c(90, 140))
the_y_label <- NULL
}
to_illustrate <- df_to_g_theo[[pop_name]] %>%
reshape2::melt(measure.vars = paste0(levels_for_premiums[1:3], '_t')) %>%
mutate(variable = factor(variable,
levels = paste0(levels_for_premiums[1:3], '_t'),
labels = labels_for_premiums[1:3])) %>%
filter(x1 <= 10, x1 >= -8)
# group_by(x1, d, variable) %>% summarise(value = mean(value)) %>% ungroup %>%
## mask part of the line
to_illustrate$value[to_illustrate$pdx_t < 0.1] <- NA
to_illustrate %>%
ggplot(aes(x = x1, y = value,
lty = factor(d),
linewidth = factor(x2),
shape = factor(x2),
alpha = factor(d),
color = factor(variable))) +
geom_line() +
scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +
scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) +
labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
title = paste0('Scenario ', pop_id)) +
scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
the_y_scale +
theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
ncol = 3,
widths = c(18, 15, 15), heights = 1,
common.legend = T,
legend = 'right')
ggsave(filename = "figs/graph_premiumsdense_BUA_t_perpop.png",
plot = to_save_premiumsdense_BUA_t_perpop,
height = 4,
width = 10.55,
units = "in",
device = "png", dpi = 500)
Related section
The five families of the Spectrum of fairness are described in Section 4.2 of the main paper.
Figure 1.3 illustrates the aware, hyperaware, and corrective premiums. The aware premium in olive green is the same as in Figure 1.2. The hyperaware premium in forest green uses \(X_2\) as a proxy for \(D\) to approach the corrective premium in blue. The latter transforms the best-estimate premium to enforce demographic parity. In Scenarios 1 and 2, this induces positive discrimination, reversing the order of the \(d = 1\) (dashed) and \(d = 0\) (solid) curves relative to Figure 1.2. In Scenario 3, achieving parity requires only slight adjustments to the best-estimate premium, producing the crossing pattern in the third panel of Figure 1.3.
to_save_premiumsdense_AHC_t_perpop <- names(df_to_g_theo) %>% lapply(function(pop_name){
## the colors
cols <- RColorBrewer::brewer.pal(5, 'Spectral')[3:5] %>% colorspace::darken(0.25)
pop_id <- which(names(parms) == pop_name)
## keep the y axis only for the first panel
if(pop_name == head(names(df_to_g_theo), 1)){ # first panel: show y labels
the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
labels = scales::dollar,
limits = c(90, 130))
the_y_label <-
latex2exp::TeX("$\\mu(x_1, x_2, d)$")
} else { # otherwise, remove everything
the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
labels = NULL,
limits = c(90, 130))
the_y_label <- NULL
}
to_illustrate <- df_to_g_theo[[pop_name]] %>%
reshape2::melt(measure.vars = paste0(levels_for_premiums[3:5], '_t')) %>%
mutate(variable = factor(variable,
levels = paste0(levels_for_premiums[3:5], '_t'),
labels = labels_for_premiums[3:5])) %>%
filter(x1 <= 10, x1 >= -8)
# group_by(x1, d, variable) %>% summarise(value = mean(value)) %>% ungroup %>%
## mask part of the line
to_illustrate$value[to_illustrate$pdx_t < 0.1] <- NA
to_illustrate %>%
ggplot(aes(x = x1, y = value,
lty = factor(d),
linewidth = factor(x2),
shape = factor(x2),
alpha = factor(d),
color = factor(variable))) +
geom_line() +
scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +
scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) +
labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
title = paste0('Scenario ', pop_id)) +
scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
the_y_scale +
theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
ncol = 3,
widths = c(18, 15, 15),
heights = 1,
common.legend = T,
legend = 'right')
ggsave(filename = "figs/graph_premiumsdense_AHC_t_perpop.png",
plot = to_save_premiumsdense_AHC_t_perpop,
height = 4,
width = 10.55,
units = "in",
device = "png", dpi = 500)
We start with the generic simulating function, which takes the desired number of simulations n and a set of parameters params as arguments.
simulate_ex_art <- function(n = 1e6,
params){
## dependent (X, D)
X1 <- params[['g_0']] + params[['sd_X']] *
rnorm(n)
X2 <- sample(1:4, size = n, replace = TRUE)
pD <- prob_D(X1, X2, D = rep(1, n), params = params)
D <- rbinom(n, size = 1, prob = pD)
## Final elements
Y <- rnorm(n, mean = Esp_Y(X1, X2, D, params),
           sd = params[['sd_Y']] + D) # dispersion slightly larger when D = 1
## Ok goodbye now
to_return <- data.frame('D' = D,
'X1' = X1,
'X2' = X2,
'Y' = Y)
return(to_return)
}
We simulate 1e5/1e4/1e4 observations for train/valid/test per scenario.
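Before the full runs, a small illustrative call (the seed and size are arbitrary; not part of the study):
set.seed(0)
head(simulate_ex_art(n = 5, params = parms$Scenario1))
#> a data.frame with columns D, X1, X2, and Y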
n <- 1e5
## the simulation files
files_sims <- paste0('simuls/',
c('train', 'valid', 'test'),
'_scenarios.json')
## If the folder does not exist, create it
if (!dir.exists('simuls')) {
dir.create('simuls')
}
## We load the simulations if they exist; otherwise we simulate and save them (JSON as a general data format for easy interchange with Python)
if(file.exists(files_sims[1])){# load
sims <- jsonlite::fromJSON(files_sims[1])
} else {# simulate and write
set.seed(321)
sims <- parms %>% lapply(function(parameters){
simulate_ex_art(n = n, params = parameters)
})
sims %>% jsonlite::toJSON(., pretty = TRUE) %>% write(files_sims[1])
}
if(file.exists(files_sims[2])){ # load
valid <- jsonlite::fromJSON(files_sims[2])
} else { # simulate and write
set.seed(123)
valid <- parms %>% lapply(function(parameters){
simulate_ex_art(n = n/10, params = parameters)
})
valid %>% jsonlite::toJSON(., pretty = TRUE) %>% write(files_sims[2])
}
if(file.exists(files_sims[3])){ # load
test <- jsonlite::fromJSON(files_sims[3])
} else { # simulate and write
set.seed(132)
test <- parms %>% lapply(function(parameters){
simulate_ex_art(n = n/10, params = parameters)
})
test %>% jsonlite::toJSON(., pretty = TRUE) %>% write(files_sims[3])
}
file_sims <- 'simuls/sim_study.json'
if(file.exists(file_sims)){ # load
sim_samples <- jsonlite::fromJSON(file_sims)
} else { # simulate and write
set.seed(132)
sim_samples <- lapply(1:100, function(idx){
nsmall <- 2e3
list('train' = simulate_ex_art(n = nsmall, params = parms$Scenario1),
'valid' = simulate_ex_art(n = nsmall/4, params = parms$Scenario1),
'test' = simulate_ex_art(n = nsmall/4, params = parms$Scenario1))
})
names(sim_samples) <- paste0('sim_', 1:100)
sim_samples %>% jsonlite::toJSON(., pretty = TRUE) %>% write(file_sims)
}
pop_to_g_theo_file <- "preds/pop_to_g_theo.json"
create_1d_interpolators <- function(grid_data, premium_names) {
# Create interpolators for each premium in premium_names
interpolators <- lapply(premium_names, function(premium_name) {
split(grid_data, list(grid_data$d, grid_data$x2)) %>%
lapply(function(group) {
# Ensure data is sorted by x1 for interpolation
group <- group[order(group$x1), ]
# Define bounds
min_x1 <- min(group$x1)
max_x1 <- max(group$x1)
# Create 1D interpolation function
interpolator <- approxfun(group$x1, group[[premium_name]], rule = 2) # rule = 2: constant (nearest-value) extrapolation
# Wrap the interpolator with bounds checking
function(x) {
x <- pmin(pmax(x, min_x1), max_x1) # Clip x to the grid range
interpolator(x)
}
})
})
# Name the list by premium names
names(interpolators) <- premium_names
return(interpolators)
}
interpolate_to_simulated <- function(simulated_data, interpolators) {
# Initialize a list to store interpolated results for each premium
interpolated_results <- list()
# Iterate over premium names
for (premium_name in names(interpolators)) {
interpolated_premiums <- numeric(nrow(simulated_data))
# Iterate over unique (D, X2) combinations in the simulated data
unique_combinations <- unique(simulated_data[, c("D", "X2")])
for (i in seq_len(nrow(unique_combinations))) {
combo <- unique_combinations[i, ]
d_value <- combo$D
x2_value <- combo$X2
# Select the appropriate interpolator
interpolator_key <- paste(d_value, x2_value, sep = ".")
interpolator <- interpolators[[premium_name]][[interpolator_key]]
# Find rows matching this combination
matching_rows <- which(simulated_data$D == d_value & simulated_data$X2 == x2_value)
# Apply interpolation on X1 for these rows
interpolated_premiums[matching_rows] <- interpolator(simulated_data$X1[matching_rows])
}
# Store the interpolated results
interpolated_results[[premium_name]] <- interpolated_premiums
}
# Combine results into a data frame
interpolated_df <- do.call(cbind, interpolated_results)
return(interpolated_df)
}
# Check if the JSON file exists
if (file.exists(pop_to_g_theo_file)) {
message(sprintf("[%s] File exists. Reading pop_to_g_theo from %s", Sys.time(), pop_to_g_theo_file))
pop_to_g_theo <- fromJSON(pop_to_g_theo_file)
} else {
pop_to_g_theo <- setNames(nm = names(sims)) %>% lapply(function(name) {
message(sprintf("[%s] Processing: %s", Sys.time(), name))
# Start time for this simulation
start_time <- Sys.time()
list_data <- list('train' = sims[[name]],
'valid' = valid[[name]],
'test' = test[[name]])
result <- setNames(nm = names(list_data)) %>% lapply(function(nm){
data <- list_data[[nm]]
# Step 3: Compute theoretical premiums (EXCEPT muH AND muC, which we interpolate below)
message(sprintf("[%s] Step 3: Computing theoretical premiums (no muH and muC)", Sys.time()))
theo_premiums_except_muH_muC <- setNames(object = setdiff(levels_for_premiums, c("mu_H", "mu_C")), nm = paste0(setdiff(levels_for_premiums, c("mu_H", "mu_C")), '_t')) %>%
sapply(function(s) {
message(sprintf("[%s] Computing theoretical premium: %s", Sys.time(), s))
premiums_theo[[name]][[s]](
x1 = data$X1,
x2 = data$X2,
d = data$D
)
})
# Step 3b: interpolate from the grid to get the mu_H_t and mu_C_t values
message(sprintf("[%s] Step 3b: Interpolating mu_C and mu_H from grid", Sys.time()))
interpolators <- create_1d_interpolators(df_to_g_theo[[name]], c("mu_H_t", "mu_C_t"))
interpolated_muH_muC_t <- interpolate_to_simulated(data, interpolators)
# Step 4: Compute theoretical PDX
message(sprintf("[%s] Step 4: Computing theoretical PDX", Sys.time()))
pdx_t_results <- prob_D(
X1 = data$X1,
X2 = data$X2,
D = data$D,
params = parms[[name]]
)
# Combine results
message(sprintf("[%s] Step 5: Combining results", Sys.time()))
data.frame(
data,
theo_premiums_except_muH_muC,
interpolated_muH_muC_t,
pdx_t = pdx_t_results
)
})
# Log completion time
end_time <- Sys.time()
message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))
return(result)
})
# Save the entire df_to_g object to JSON
message(sprintf("[%s] Saving pop_to_g_theo to %s", Sys.time(), pop_to_g_theo_file))
toJSON(pop_to_g_theo, pretty = TRUE, auto_unbox = TRUE) %>% write(pop_to_g_theo_file)
}
sims_to_g_theo_file <- "preds/sims_to_g_theo.json"
# Check if the JSON file exists
if (file.exists(sims_to_g_theo_file)) {
message(sprintf("[%s] File exists. Reading sims_to_g_theo from %s", Sys.time(), sims_to_g_theo_file))
sims_to_g_theo <- fromJSON(sims_to_g_theo_file)
} else {
sims_to_g_theo <- setNames(object = seq_along(sim_samples),
nm = names(sim_samples)) %>% lapply(function(idx) {
message(sprintf("[%s] Processing: %s", Sys.time(), paste0(idx, '/', length(sim_samples))))
samples_to_ret <- setNames(nm = names(sim_samples[[idx]])) %>% lapply(function(nm){
data <- sim_samples[[idx]][[nm]]
# Step 3: Compute theoretical premiums (EXCEPT muH AND muC, which we interpolate below)
message(sprintf("[%s] Step 3: Computing theoretical premiums (no muH and muC)", Sys.time()))
theo_premiums_except_muH_muC <- setNames(object = setdiff(levels_for_premiums, c("mu_H", "mu_C")), nm = paste0(setdiff(levels_for_premiums, c("mu_H", "mu_C")), '_t')) %>%
sapply(function(s) {
message(sprintf("[%s] Computing theoretical premium: %s", Sys.time(), s))
premiums_theo[['Scenario1']][[s]](
x1 = data$X1,
x2 = data$X2,
d = data$D
)
})
# Step 3b: interpolate from the grid to get the mu_H_t and mu_C_t values
message(sprintf("[%s] Step 3b: Interpolating mu_H and mu_C from grid", Sys.time()))
interpolators <- create_1d_interpolators(df_to_g_theo[['Scenario1']], c("mu_H_t", "mu_C_t"))
interpolated_muH_muC_t <- interpolate_to_simulated(data, interpolators)
# Step 4: Compute theoretical PDX
message(sprintf("[%s] Step 4: Computing theoretical PDX", Sys.time()))
pdx_t_results <- prob_D(
X1 = data$X1,
X2 = data$X2,
D = data$D,
params = parms[['Scenario1']]
)
# Combine results
message(sprintf("[%s] Step 5: Combining results", Sys.time()))
result <- data.frame(
data,
theo_premiums_except_muH_muC,
interpolated_muH_muC_t,
pdx_t = pdx_t_results
)
})
return(samples_to_ret)
})
# Save the entire df_to_g object to JSON
message(sprintf("[%s] Saving sims_to_g_theo to %s", Sys.time(), sims_to_g_theo_file))
toJSON(sims_to_g_theo, pretty = TRUE, auto_unbox = TRUE) %>% write(sims_to_g_theo_file)
}
pop_stats_path_theo <- 'preds/preds_pop_stats_theo.json'
# Check and load or compute preds_pop_stats
if (file.exists(pop_stats_path_theo)) {
preds_pop_stats_theo <- fromJSON(pop_stats_path_theo)
} else {
preds_pop_stats_theo <- setNames(nm = names(sims)) %>%
lapply(function(name) {
setNames(nm = names(pop_to_g_theo[[name]])) %>%
lapply(function(set) {
local_df <- pop_to_g_theo[[name]][[set]]
data.frame(local_df,
'risk_spread_t' = quants_theo[[name]][['risk_spread']](x1 = local_df$X1,
x2 = local_df$X2,
d = local_df$D),
'proxy_vuln_t' = quants_theo[[name]][['proxy_vuln']](x1 = local_df$X1,
x2 = local_df$X2,
d = local_df$D),
## computed directly from the stored premium columns, which is quicker (only) in the theoretical case: it avoids the numerical integration behind mu_C.
'fair_range_t' = (apply(local_df[c('mu_B_t', 'mu_U_t', 'mu_A_t', 'mu_H_t', 'mu_C_t')], 1, max) -
apply(local_df[c('mu_B_t', 'mu_U_t', 'mu_A_t', 'mu_H_t', 'mu_C_t')], 1, min)) |> unname(),
'parity_cost_t' = local_df[['mu_C_t']] - local_df[['mu_B_t']])
})
})
toJSON(preds_pop_stats_theo, pretty = TRUE, auto_unbox = TRUE) %>%
write(pop_stats_path_theo)
}
sims_stats_path_theo <- 'preds/preds_sims_stats_theo.json'
# Check and load or compute preds_sims_stats
if (file.exists(sims_stats_path_theo)) {
preds_sims_stats_theo <- fromJSON(sims_stats_path_theo)
} else {
preds_sims_stats_theo <- setNames(nm = names(sims_to_g_theo)) %>%
lapply(function(idx) {
setNames(nm = names(sims_to_g_theo[[idx]])) %>%
lapply(function(set) {
local_df <- sims_to_g_theo[[idx]][[set]]
data.frame(local_df,
'risk_spread_t' = quants_theo[['Scenario1']][['risk_spread']](x1 = local_df$X1,
x2 = local_df$X2,
d = local_df$D),
'proxy_vuln_t' = quants_theo[['Scenario1']][['proxy_vuln']](x1 = local_df$X1,
x2 = local_df$X2,
d = local_df$D),
## computed directly from the stored premium columns, which is quicker (only) in the theoretical case: it avoids the numerical integration behind mu_C.
'fair_range_t' = (apply(local_df[c('mu_B_t', 'mu_U_t', 'mu_A_t', 'mu_H_t', 'mu_C_t')], 1, max) -
apply(local_df[c('mu_B_t', 'mu_U_t', 'mu_A_t', 'mu_H_t', 'mu_C_t')], 1, min)) |> unname(),
'parity_cost_t' = local_df[['mu_C_t']] - local_df[['mu_B_t']])
})
})
toJSON(preds_sims_stats_theo, pretty = TRUE, auto_unbox = TRUE) %>%
write(sims_stats_path_theo)
}